package com.mano.web.webmagic;

import com.mano.web.domain.Journal;
import com.mano.web.domain.JournalPage;
import com.mano.web.util.DateUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * WebMagic {@link PageProcessor} for the e-paper published under {@link #BASE_URL}.
 *
 * <p>For a matching page it reads the HTML {@code <title>} (for example
 * {@code 中国体彩报 - 第1151期 - 第01版 - A1}), derives the issue number and issue description
 * from it, collects the page links found under {@code div#m1}, and publishes the resulting
 * {@link Journal} in the page's result fields under the key {@code "journal"}.
 *
 * @Author: zj
 * @Date: Created in 14:00 2020/9/3
 */
@Component
public class XpaperZgtcbProcessor implements PageProcessor {

    private static final Logger logger = LoggerFactory.getLogger(XpaperZgtcbProcessor.class);

    /** Root URL of the e-paper; also used as the seed URL in {@link #main(String[])}. */
    public static final String BASE_URL = "http://i.xpaper.net/cnsports";

    /**
     * {@link #BASE_URL} quoted for use as a regular expression, so that the dots in the host
     * name match literally instead of matching any character.
     */
    private static final String BASE_URL_REGEX = Pattern.quote(BASE_URL);

    /** Matches runs of ASCII whitespace and non-breaking spaces (U+00A0). */
    private static final Pattern WHITESPACE = Pattern.compile("[\\s\\u00A0]+");

    // User agent picked once per processor instance from the project's Agent helper.
    private final String agent = Agent.getRandom();

    // NOTE(review): setDomain receives a full URL here rather than a bare host name;
    // left unchanged to preserve behavior - confirm whether "i.xpaper.net" was intended.
    private final Site site = Site.me()
            .setDomain(BASE_URL)
            .setSleepTime(1000)
            .setRetryTimes(30)
            .setCharset("utf-8")
            .setTimeOut(30000)
            .setUserAgent(agent);

    /**
     * Extracts the journal and its page list from a downloaded page.
     *
     * <p>Pages whose URL does not contain {@link #BASE_URL} are ignored. Pages whose title
     * cannot be split into at least "name - issue" are logged and marked as skipped instead
     * of failing with an exception.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        if (!page.getUrl().regex(BASE_URL_REGEX).match()) {
            return;
        }

        String contentTitle = page.getHtml().xpath("//title/text()").toString();
        Journal journal = buildJournal(contentTitle);
        if (journal == null) {
            logger.warn("Unexpected page title, skipping page. url={}, title={}",
                    page.getUrl(), contentTitle);
            page.setSkip(true);
            return;
        }

        logger.info("期刊数据:{}", journal);

        List<Selectable> anchors = page.getHtml().xpath("//div[@id='m1']/a").nodes();
        if (anchors != null && !anchors.isEmpty()) {
            journal.setJournalPages(collectPages(journal, anchors));
            logger.info("journal.toString():{}", journal);
        }
        page.putField("journal", journal);
    }

    /**
     * Builds a {@link Journal} from the page title.
     *
     * <p>Example: for the title {@code 中国体彩报 - 第1151期 - 第01版 - A1} the issue is
     * {@code 1151} and the issue description is made of the first two "-" separated parts.
     *
     * @param contentTitle raw text of the page's {@code <title>} element, may be {@code null}
     * @return the populated journal, or {@code null} when the title is blank or does not
     *         contain at least two "-" separated parts
     */
    private static Journal buildJournal(String contentTitle) {
        if (StringUtils.isBlank(contentTitle)) {
            return null;
        }
        String[] titleParts = contentTitle.trim().split("-");
        if (titleParts.length < 2) {
            return null;
        }

        // The title contains several kinds of blanks, so all of them are removed at once.
        String issue = stripWhitespace(titleParts[1].replace("第", "").replace("期", ""));
        String issueDesc = titleParts[0] + "-" + titleParts[1];

        Journal journal = new Journal();
        journal.setTitle(issueDesc);
        journal.setTitleDesc(contentTitle);
        journal.setIssue(issue);
        journal.setDate(DateUtils.INSTANCE.getCurrentDate());
        journal.setDateStr(DateUtils.INSTANCE.getCuttentDate());
        journal.setType((short) 1);
        journal.setStatus((short) 1);
        journal.setGrabDate(new Date());
        journal.setCreatedAt(new Date());
        journal.setUpdatedAt(new Date());
        return journal;
    }

    /**
     * Converts the page anchors into {@link JournalPage} entries.
     *
     * <p>An anchor looks like
     * {@code <a href="http://i.xpaper.net/cnsports/release/539/2040.shtml"> <b>第01版 &nbsp; A1</b> </a>}.
     * Anchors without a usable title or link are dropped. When the first anchor is usable,
     * its link and title are also stored on the journal as URL and stage description.
     *
     * @param journal the journal the pages belong to; updated from the first anchor
     * @param anchors the {@code <a>} nodes found under {@code div#m1}
     * @return the journal pages in document order, possibly empty
     */
    private static List<JournalPage> collectPages(Journal journal, List<Selectable> anchors) {
        List<JournalPage> journalPages = new ArrayList<JournalPage>(anchors.size());
        for (int i = 0; i < anchors.size(); i++) {
            Selectable anchor = anchors.get(i);

            String link = anchor.links().toString();
            String title = extractPageTitle(anchor);
            if (StringUtils.isBlank(title) || StringUtils.isBlank(link)) {
                continue;
            }

            if (i == 0) {
                journal.setUrl(link);
                journal.setStageDesc(title);
            }

            JournalPage journalPage = new JournalPage();
            journalPage.setJournalId(journal.getId());
            journalPage.setPageHtmlTitle(title);
            journalPage.setPageHtmlUrl(link);
            journalPage.setStatus((short) 1);
            journalPage.setGrabDate(new Date());
            journalPage.setCreatedAt(new Date());
            journalPage.setUpdatedAt(new Date());

            logger.info("版面数据：{}", journalPage);

            journalPages.add(journalPage);
        }
        return journalPages;
    }

    /**
     * Reads the display title of one page anchor.
     *
     * <p>The text of the nested {@code <b>} element is preferred. When it is blank, the text
     * that follows the anchor's opening tag is used instead. All blanks are removed and a
     * single space is inserted after each "版" character.
     *
     * @param anchor one {@code <a>} node
     * @return the normalized title, or {@code null} when no title text can be located
     */
    private static String extractPageTitle(Selectable anchor) {
        String rawTitle = anchor.xpath("//b/text()").toString();
        if (StringUtils.isBlank(rawTitle)) {
            String anchorHtml = anchor.toString();
            String[] segments = anchorHtml == null ? new String[0] : anchorHtml.split(">");
            if (segments.length < 2) {
                // No text after the opening tag: nothing to use as a title.
                return null;
            }
            rawTitle = segments[1].replace("</a", "").replace(" ", "").replace("&nbsp;", " ");
        }
        return stripWhitespace(rawTitle).replace("版", "版 ");
    }

    /**
     * Removes every whitespace character and non-breaking space from the given text.
     *
     * @param text the text to clean, must not be {@code null}
     * @return the text without any blanks
     */
    private static String stripWhitespace(String text) {
        Matcher matcher = WHITESPACE.matcher(text);
        return matcher.replaceAll("");
    }

    /**
     * @return the crawl settings used for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs a single-threaded crawl of {@link #BASE_URL} and returns when it has finished.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new XpaperZgtcbProcessor());
        spider.addUrl(BASE_URL);
        spider.addPipeline(new XpaperZgtcbPopeline());
        spider.thread(1);
        spider.setExitWhenComplete(true);
        // run() blocks until the crawl is complete. The previous start() + stop() pair
        // launched the crawl asynchronously and then requested a stop right away, which
        // could end the crawl before the seed page had been processed.
        spider.run();
    }
}
